When removing the gene ARHGAP11B (ENSG00000187951) there was an important change in the WGCNA modules, and when analysing what could have caused this, we found that this gene had a very big outlier value for a single sample, which we think altered the values of the other genes during the vst transformation enough to alter the modules by the WGNCA algorithm (See 20_02_28_comparison.html for more details about this).

Since the preprocessing pipeline did not catch this behaviour and it ended up causing problems in downstream analysis, we want to see how often we can find these type of behaviours in the data and if/how much they alter the results of WGCNA modules.

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggExtra) ; library(ggpubr)
library(WGCNA)
library(expss)
library(knitr)
library(polycor)
# Load preprocessed data to get DE information and the genes that weren't filtered during preprocessing
load('./../../AllRegions/Data/filtered_raw_data_old.RData')
datExpr_filtered = datExpr
datGenes_filtered = datGenes

load('./../../AllRegions/Data/preprocessed_data.RData')
DE_info = DE_info %>% data.frame %>% mutate('ID' = rownames(datExpr), 'DE' = padj<0.05) %>%
          dplyr::select(ID, log2FoldChange, padj, DE)


# Load original expression datasets
datExpr = read.csv('./../Data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../Data/RNAseq_ASD_datMeta.csv')


datMeta = datMeta %>% mutate(Brain_Region = as.factor(Region)) %>% 
                      mutate(Brain_lobe = case_when(Brain_Region == 'BA4_6' ~ 'Frontal',
                                                    Brain_Region == 'BA7'   ~ 'Parietal',
                                                    Brain_Region == 'BA38'  ~ 'Temporal',
                                                    TRUE ~ 'Occipital')) %>%
                      mutate(Batch = as.factor(gsub('/', '.', RNAExtractionBatch)), 
                             Diagnosis = factor(Diagnosis_, levels=c('CTL','ASD'))) %>% 
                      dplyr::select(-Diagnosis_)

# Filtering genes: These filters would be the same, so I'll just keep the genes present in the 
# filtered dataset instead of repeating the whole filtering
datExpr = datExpr[rownames(datExpr) %in% rownames(datExpr_filtered),]
datGenes = datGenes_filtered %>% filter(ensembl_gene_id %in% rownames(datExpr_filtered))

rm(datExpr_filtered, datGenes_filtered, dds)

Original z-score metric


\(metric_i = max_j \frac{|x_{i,j} - mean(x_i)|}{sd(x_i)}\)

z_scores = datExpr %>% apply(1, function(x) abs(x-mean(x))/sd(x)) %>% t %>% data.frame


Outlier genes


Genes with the highest z-score in any of its entries

max_z_scores = data.frame('ID' = rownames(z_scores), 'max_z_score' = z_scores %>% apply(1, max), 
                          'outlier_sample' = z_scores %>% apply(1, function(x) datMeta$Sample_ID[which.max(x)])) %>%
               left_join(DE_info, by = 'ID')
                            

ENSG00000187951_val = max_z_scores[max_z_scores$ID == 'ENSG00000187951',2]

cat(paste0('Gene ARHGAP11B has a value of ', round(ENSG00000187951_val,1)))
## Gene ARHGAP11B has a value of 9.2
cat(paste0('There are ', sum(max_z_scores$max_z_score>ENSG00000187951_val),
           ' genes with a value higher than this (~',
           round(100*mean(max_z_scores$max_z_score>ENSG00000187951_val),2),'%)'))
## There are 4 genes with a value higher than this (~0.02%)

The dotted line indicates the value for the gene we removed ARHGAP11B (ENSG00000187951)

summary(max_z_scores$max_z_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.792   2.804   3.409   3.832   4.487   9.265
p = max_z_scores %>% ggplot(aes(ID, max_z_score+1, color = DE)) + geom_point(alpha=0.3) + 
                           geom_hline(yintercept = ENSG00000187951_val+1, color='gray', linetype = 'dotted') + 
                           xlab('Genes') + ylab('Max |Z-score|') + theme_minimal() + scale_y_log10() +
                           ggtitle('Max|z-score| value for each gene') +
                           theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(),
                                 legend.position='none', panel.grid.major = element_blank())

ggExtra::ggMarginal(p, type = 'density', margins='y', groupColour=TRUE, groupFill=TRUE, size=10)

rm(p)

Top 20 genes

  • These genes have more than one outlier sample

  • All of the outliers corres pond to ASD samples

  • Standardising the max_z_score value of each gene, all of the 20 top genes would be considered outliers (more than 2 sd from the average max_z_score value)

plot_function = function(i){
  i = 3*i-2
  plot_list = list()
  for(j in 1:3){
    plot_data = data.frame('sample' = colnames(datExpr), 'expr' = t(datExpr[top_genes$ID[i+j-1],]),
                           'Diagnosis' = datMeta$Diagnosis)
    colnames(plot_data)[2] = 'expr'
    plot_list[[j]] = ggplotly(plot_data %>% ggplot(aes(sample, expr, color=Diagnosis)) + 
                              geom_point() + theme_minimal() + 
                              theme(legend.position='none', axis.text.x=element_blank(), axis.ticks.x=element_blank()))
  }
  p = subplot(plot_list, nrows=1) %>% layout(annotations = list(
    list(x = 0.1 , y = 1.05, text =  top_genes$hgnc_symbol[i], showarrow = F, xref='paper', yref='paper'),
    list(x = 0.5 , y = 1.05, text = top_genes$hgnc_symbol[i+1], showarrow = F, xref='paper', yref='paper'),
    list(x = 0.9 , y = 1.05, text = top_genes$hgnc_symbol[i+2], showarrow = F, xref='paper', yref='paper')))
  
  return(p)
}

top_genes = max_z_scores %>% dplyr::top_n(n=21, w=max_z_score) %>% arrange(desc(max_z_score)) %>% 
            left_join(datGenes, by=c('ID' = 'ensembl_gene_id')) %>%
            mutate('StandMaxZscore' = (max_z_score-mean(max_z_scores$max_z_score))/sd(max_z_scores$max_z_score)) %>%
            dplyr::select(ID, hgnc_symbol, log2FoldChange, padj, DE, max_z_score, StandMaxZscore)

kable(top_genes, caption='Top 20 genes with the highest max z-score value')
Top 20 genes with the highest max z-score value
ID hgnc_symbol log2FoldChange padj DE max_z_score StandMaxZscore
ENSG00000102802 MEDAG 0.0924838 0.7431112 FALSE 9.265417 3.949315
ENSG00000140285 FGF7 0.8934000 0.0061764 TRUE 9.257698 3.943704
ENSG00000113083 LOX 0.4987464 0.0092761 TRUE 9.252452 3.939891
ENSG00000166741 NNMT 0.0937272 0.6452867 FALSE 9.242967 3.932996
ENSG00000187951 ARHGAP11B NA NA NA 9.235620 3.927656
ENSG00000170577 SIX2 1.2910712 0.0142510 TRUE 9.235302 3.927425
ENSG00000180447 GAS1 0.2830335 0.0037888 TRUE 9.230353 3.923827
ENSG00000087245 MMP2 0.4116331 0.0009957 TRUE 9.227912 3.922053
ENSG00000106483 SFRP4 0.5635852 0.0143759 TRUE 9.215450 3.912995
ENSG00000181541 MAB21L2 0.5391928 0.3724211 FALSE 9.197313 3.899811
ENSG00000115461 IGFBP5 0.5291559 0.0000252 TRUE 9.161680 3.873910
ENSG00000197614 MFAP5 0.6564396 0.0723218 FALSE 9.160145 3.872795
ENSG00000164220 F2RL2 0.6499642 0.0886975 FALSE 9.153819 3.868196
ENSG00000145423 SFRP2 0.5797932 0.0247518 TRUE 9.138815 3.857290
ENSG00000196092 PAX5 0.6662045 0.0779792 FALSE 9.134951 3.854482
ENSG00000149380 P4HA3 0.0834326 0.8398095 FALSE 9.107004 3.834168
ENSG00000165646 SLC18A2 -0.4914993 0.2622890 FALSE 9.098293 3.827836
ENSG00000159167 STC1 1.0011532 0.0000165 TRUE 9.093926 3.824661
ENSG00000122176 FMOD 0.4601202 0.0958029 FALSE 9.089579 3.821501
ENSG00000168542 COL3A1 1.2491085 0.0000001 TRUE 9.084016 3.817458
ENSG00000139899 CBLN3 0.4478234 0.0619673 FALSE 9.077338 3.812604
plot_function(1)
plot_function(2)
plot_function(3)
plot_function(4)
plot_function(5)
plot_function(6)
plot_function(7)


Outlier samples


If there was no dependence between the samples and the outliers, then the maximum |z-scores| would be evenly distributed along the 88 samples, with ~183 maximum z-scores in each sample

  • The highest counts are grouped around a set of samples instead of being uniformly distributed

  • The patient the samples belong to seems to also play a small role (for example, AN02987 or AN07176)

  • Most samples with the highest counts belong to the ASD group

  • Brain Region doesn’t seem to play a significant role in this

  • Samples with the highest counts come from both Batches (this information is not included in the plot)


Defining outlier samples: Samples with a standardised Count > 2 (everything above the dotted line)

samples_info = max_z_scores$outlier_sample %>% table %>% data.frame %>% 
               dplyr::rename(Sample_ID = '.', Count = 'Freq') %>%
               left_join(datMeta, by = 'Sample_ID')  %>% arrange(desc(Count)) %>% 
               mutate('StandardisedCount' = round((Count-mean(Count))/sd(Count),2)) %>%
               dplyr::select(Sample_ID, Count, StandardisedCount, Subject_ID, Sex, Age, Brain_lobe, Batch, Diagnosis)

ggplotly(samples_info %>% ggplot(aes(Subject_ID, Count, fill=Diagnosis, color=Brain_lobe)) + 
         geom_bar(stat = 'identity', size = 0, position = position_dodge2(preserve = 'single')) + xlab('Subjects') +
         geom_hline(yintercept = nrow(max_z_scores)/nrow(datMeta), color = 'gray') + 
         geom_hline(yintercept = mean(samples_info$Count)+2*sd(samples_info$Count), color = 'gray', linetype = 'dotted') + 
         ggtitle('Number of genes for which their maximum |z-score| value fell in each Sample') + 
         theme_minimal() + theme(axis.text.x = element_text(angle = 90, hjust = 1),
                                 legend.position = 'none'))
kable(samples_info %>% filter(Count>nrow(max_z_scores)/nrow(datMeta)), 
      caption = 'Samples with more max z-scores than expected by random assignment')
Samples with more max z-scores than expected by random assignment
Sample_ID Count StandardisedCount Subject_ID Sex Age Brain_lobe Batch Diagnosis
AN01971_BA38 1854 4.77 AN01971 M 39 Temporal 10.10.2014 ASD
AN13872_BA4_6 1806 4.63 AN13872 F 5 Frontal 6.20.2014 ASD
AN02987_BA38 1170 2.82 AN02987 M 15 Temporal 10.10.2014 ASD
AN09714_BA38 1031 2.42 AN09714 M 60 Temporal 10.10.2014 ASD
AN11796_BA7 902 2.05 AN11796 M 14 Parietal 6.20.2014 ASD
AN14757_BA17 749 1.61 AN14757 M 24 Occipital 10.10.2014 CTL
AN02987_BA7 742 1.59 AN02987 M 15 Parietal 10.10.2014 ASD
AN07176_BA4_6 716 1.52 AN07176 M 21 Frontal 10.10.2014 CTL
AN02987_BA17 685 1.43 AN02987 M 15 Occipital 10.10.2014 ASD
AN00493_BA17 513 0.94 AN00493 M 27 Occipital 10.10.2014 ASD
AN01093_BA7 475 0.83 AN01093 M 56 Parietal 10.10.2014 ASD
AN08873_BA17 461 0.79 AN08873 M 5 Occipital 10.10.2014 ASD
AN07176_BA38 436 0.72 AN07176 M 21 Temporal 10.10.2014 CTL
AN00493_BA4_6 337 0.44 AN00493 M 27 Frontal 6.20.2014 ASD
AN17254_BA17 298 0.33 AN17254 M 51 Occipital 6.20.2014 ASD
AN11796_BA38 278 0.27 AN11796 M 14 Temporal 10.10.2014 ASD
AN17515_BA38 252 0.20 AN17515 M 54 Temporal 10.10.2014 ASD
AN01971_BA17 197 0.04 AN01971 M 39 Occipital 10.10.2014 ASD
AN10723_BA38 186 0.01 AN10723 M 60 Temporal 10.10.2014 CTL


To see if the samples classified as outliers have a different general behaviour than the other samples (not just on the genes which had their maximum value in them), we calculate the z-score value each gene has on each sample and see their distribution

The outlier samples seem to have a higher distribution of z-scores along all of the genes when comparing them to random samples, not just the one which corresponded to the max|z-score|

plot_data = data.frame('ID' = rownames(datExpr))
outlier_samples = samples_info$Sample_ID[samples_info$Count>mean(samples_info$Count)+2*sd(samples_info$Count)]

# Calculate z-score of each gene for the outlier samples
for(s in outlier_samples){
  outlier_idx = which(datMeta$Sample_ID==s)
  z_scores = apply(datExpr, 1, function(x) (abs(x[outlier_idx]-mean(x)))/sd(x))
  plot_data = cbind(plot_data, z_scores)
}
colnames(plot_data)[-1] = as.character(outlier_samples)

# Select random samples for comparison
set.seed(123)
rand_samp_1 = sample(datMeta$Sample_ID[! datMeta$Sample_ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_1 = which(datMeta$Sample_ID==rand_samp_1)
set.seed(124)
rand_samp_2 = sample(datMeta$Sample_ID[! datMeta$Sample_ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_2 = which(datMeta$Sample_ID==rand_samp_2)
set.seed(125)
rand_samp_3 = sample(datMeta$Sample_ID[! datMeta$Sample_ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_3 = which(datMeta$Sample_ID==rand_samp_3)
cat(paste0('Using random samples ', rand_samp_1, ', ', rand_samp_2, ', ', rand_samp_3, ' as a reference'))
## Using random samples AN04166_BA38, AN04479_BA4_6, AN17254_BA38 as a reference
z_func = function(x, rand_samp_idx) return(abs(x[rand_samp_idx]-mean(x))/sd(x))

# Transform data for plotting
levels = c(unique(as.character(samples_info$Sample_ID[samples_info$StandardisedCount>2])),
           'Random Sample 1','Random Sample 2','Random Sample 3')
plot_data_melt = plot_data %>% mutate('Random Sample 1'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_1)),
                                      'Random Sample 2'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_2)),
                                      'Random Sample 3'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_3))) %>%
                 melt()  %>% mutate(variable = factor(variable, levels=levels, ordered=T)) %>%
                 mutate(Sample_ID = case_when(variable == 'Random Sample 1' ~ rand_samp_1,
                                              variable == 'Random Sample 2' ~ rand_samp_2,
                                              variable == 'Random Sample 3' ~ rand_samp_3,
                                              TRUE ~ as.character(variable))) %>%
                 left_join(datMeta %>% dplyr::select(Sample_ID, Diagnosis), by = 'Sample_ID')

# Plot
ggplotly(plot_data_melt %>% ggplot(aes(variable, value+1, fill=Diagnosis)) + geom_boxplot() +
         xlab('Samples') +ylab('|z-scores|') + scale_y_log10() + theme_minimal() + 
         theme(axis.text.x = element_text(angle = 90, hjust = 1)))
rm(plot_data, plot_data_melt, s, outlier_idx, z_scores, rand_samp_1, rand_samp_idx_1,
   rand_samp_2, rand_samp_idx_2, rand_samp_3, rand_samp_idx_3, levels, z_func)


Metric currently used to filter samples vs z-score


Currently used metric:

  • Create weighted sample correlation network

  • Calculate the connectivity of each node

  • Normalise connectivity of the nodes (samples)

  • Filter nodes (samples) with a connectivity lower than -2 (low connectivity, with a distance larger than 2 sd to the mean connectivity of the network)


Notes:

  • All of the outlier samples belong to ASD patients

  • Both methods agree on some samples, but others, such as AN13872_BA4_6, which the 2nd highest number of max |z-score| wouldn’t have been picked up by the original method

absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))

original_z_score = data.frame('ID' = gsub('X','',names(z.ku)), 'z_score_orig' = as.vector(z.ku)) %>%
                   left_join(datMeta, by = c('ID' = 'Dissected_Sample_ID')) %>% 
                   dplyr::select(Sample_ID, z_score_orig)

plot_data = samples_info %>% left_join(original_z_score, by = 'Sample_ID') %>% 
            mutate('norm_count' = (Count-mean(Count))/sd(Count))

ggplotly(plot_data %>% ggplot(aes(z_score_orig, norm_count, color=Diagnosis)) + 
         geom_point(alpha=0.8, aes(id=Sample_ID)) + 
         geom_vline(xintercept = -2, color='gray', linetype = 'dotted') + 
         geom_hline(yintercept = 2, color='gray', linetype = 'dotted') + 
         xlab('Currently used metric') + ylab('Standardised number of max |z-score| genes in sample') + 
         theme_minimal())
rm(absadj, netsummary, ku, z.ku, original_z_score)

I think the original metric used to classify samples as outliers agrees better with the PCA plot, although they are quite similar

pca = datExpr %>% t %>% prcomp

pca_data = data.frame('ID' = gsub('X','',colnames(datExpr)), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
                 left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
           dplyr::select(Sample_ID, PC1, PC2) %>% left_join(plot_data, by = 'Sample_ID')

p1 = ggplotly(pca_data %>% ggplot(aes(PC1, PC2, color=-StandardisedCount)) + 
              geom_point(alpha=0.8, aes(id=Sample_ID)) +
              xlab(paste0('PC1 (', round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (', round(100*summary(pca)$importance[2,2],1),'%)')) +
              scale_color_viridis() + theme_minimal() + theme(legend.position = 'none') + 
              coord_fixed())

p2 = ggplotly(pca_data %>% ggplot(aes(PC1, PC2, color=z_score_orig)) +
              geom_point(alpha=0.8, aes(id=Sample_ID)) +
              xlab(paste0('PC1 (', round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (', round(100*summary(pca)$importance[2,2],1),'%)')) +
              ggtitle('Outlier genes in each sample (left) and Original sample outlier metric (right)') +
              scale_color_viridis() + 
              theme_minimal() + theme(legend.position = 'none') + coord_fixed())

subplot(p1, p2, nrows=1)
rm(pca, pca_data, p1, p2)







Robust z-score metric


\(metric_i = max_j \frac{|x_{i,j} - median(x_i)|}{mad(x_i)}\)

z_scores = datExpr %>% apply(1, function(x) abs(x-median(x))/mad(x)) %>% t %>% data.frame


Outlier genes


Genes with the highest z-score in any of their entries

max_z_scores = data.frame('ID' = rownames(z_scores), 'max_z_score' = z_scores %>% apply(1, max), 
                          'outlier_sample' = z_scores %>% apply(1, function(x) datMeta$Sample_ID[which.max(x)])) %>%
               left_join(DE_info, by = 'ID')
                            

ENSG00000187951_val = max_z_scores[max_z_scores$ID == 'ENSG00000187951',2]

cat(paste0('Gene ARHGAP11B has a value of ', round(ENSG00000187951_val,1)))
## Gene ARHGAP11B has a value of 176.4
cat(paste0('There are ', sum(max_z_scores$max_z_score>ENSG00000187951_val),
           ' genes with a value higher than this (~',
           round(100*mean(max_z_scores$max_z_score>ENSG00000187951_val),2),'%)'))
## There are 38 genes with a value higher than this (~0.24%)

The dotted line indicates the value for the gene we removed ARHGAP11B (ENSG00000187951)

summary(max_z_scores$max_z_score)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    1.461    3.056    4.064    8.460    6.861 2414.033
p = max_z_scores %>% ggplot(aes(ID, max_z_score+1, color = DE)) + geom_point(alpha=0.3) + 
                           geom_hline(yintercept = ENSG00000187951_val+1, color='gray', linetype = 'dotted') + 
                           xlab('Genes') + ylab('Max |Z-score|') + theme_minimal() + scale_y_log10() +
                           ggtitle('Max|z-score| value for each gene') +
                           theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(),
                                 legend.position='none', panel.grid.major = element_blank())

ggExtra::ggMarginal(p, type = 'density', margins='y', groupColour=TRUE, groupFill=TRUE, size=10)

rm(p)

Top 20 genes

  • These genes have more than one outlier sample

  • All of the outliers corres pond to ASD samples

  • There seem to be a lot of DE genes

  • The standardised max_z_score values of each of the top genes are huge! Probably because of the really long right tail of this metric’s distribution

plot_function = function(i){
  i = 3*i-2
  plot_list = list()
  for(j in 1:3){
    plot_data = data.frame('sample' = colnames(datExpr), 'expr' = t(datExpr[top_genes$ID[i+j-1],]),
                           'Diagnosis' = datMeta$Diagnosis)
    colnames(plot_data)[2] = 'expr'
    plot_list[[j]] = ggplotly(plot_data %>% ggplot(aes(sample, expr, color=Diagnosis)) + 
                              geom_point() + theme_minimal() + 
                              theme(legend.position='none', axis.text.x=element_blank(), axis.ticks.x=element_blank()))
  }
  p = subplot(plot_list, nrows=1) %>% layout(annotations = list(
    list(x = 0.1 , y = 1.05, text =  top_genes$hgnc_symbol[i], showarrow = F, xref='paper', yref='paper'),
    list(x = 0.5 , y = 1.05, text = top_genes$hgnc_symbol[i+1], showarrow = F, xref='paper', yref='paper'),
    list(x = 0.9 , y = 1.05, text = top_genes$hgnc_symbol[i+2], showarrow = F, xref='paper', yref='paper')))
  
  return(p)
}

top_genes = max_z_scores %>% dplyr::top_n(n=21, w=max_z_score) %>% arrange(desc(max_z_score)) %>% 
            left_join(datGenes, by=c('ID' = 'ensembl_gene_id')) %>%
            mutate('StandMaxZscore' = (max_z_score-mean(max_z_scores$max_z_score))/sd(max_z_scores$max_z_score)) %>%
            dplyr::select(ID, hgnc_symbol, log2FoldChange, padj, DE, max_z_score, StandMaxZscore)

kable(top_genes, caption='Top 20 genes with the highest max z-score value')
Top 20 genes with the highest max z-score value
ID hgnc_symbol log2FoldChange padj DE max_z_score StandMaxZscore
ENSG00000086570 FAT2 1.2019142 0.0020615 TRUE 2414.0331 75.33339
ENSG00000173110 HSPA6 1.5110674 0.0000000 TRUE 1843.7403 57.47399
ENSG00000132130 LHX1 -0.0584408 0.9152006 FALSE 888.9788 27.57450
ENSG00000128342 LIF 1.3272429 0.0081624 TRUE 794.8874 24.62792
ENSG00000102802 MEDAG 0.0924838 0.7431112 FALSE 613.7384 18.95502
ENSG00000106366 SERPINE1 1.6873843 0.0000002 TRUE 485.6730 14.94450
ENSG00000145863 GABRA6 0.2292027 0.4438112 FALSE 435.8897 13.38548
ENSG00000127954 STEAP4 0.8241022 0.0039420 TRUE 427.6271 13.12673
ENSG00000168542 COL3A1 1.2491085 0.0000001 TRUE 423.9103 13.01033
ENSG00000171219 CDC42BPG 0.5819641 0.1224602 FALSE 406.0434 12.45081
ENSG00000159167 STC1 1.0011532 0.0000165 TRUE 404.1699 12.39213
ENSG00000136689 IL1RN 0.5765227 0.2139752 FALSE 397.2751 12.17622
ENSG00000124107 SLPI 1.4749835 0.0000995 TRUE 396.1959 12.14242
ENSG00000102265 TIMP1 0.7259531 0.0000175 TRUE 392.0005 12.01104
ENSG00000140285 FGF7 0.8934000 0.0061764 TRUE 385.8087 11.81713
ENSG00000136244 IL6 0.3529809 0.5327950 FALSE 379.0638 11.60591
ENSG00000007908 SELE -0.2781207 0.6309733 FALSE 365.5740 11.18346
ENSG00000169429 IL8 0.4413859 0.2408066 FALSE 346.0138 10.57091
ENSG00000139899 CBLN3 0.4478234 0.0619673 FALSE 338.2946 10.32917
ENSG00000129277 CCL4 0.0842550 0.8553859 FALSE 334.7722 10.21887
ENSG00000169245 CXCL10 0.7276849 0.0514795 FALSE 328.0273 10.00764
plot_function(1)
plot_function(2)
plot_function(3)
plot_function(4)
plot_function(5)
plot_function(6)
plot_function(7)


Outlier samples


The distribution of the genes with the max |z-score| is exactly the same as in the previous section because the two metrics are a monotonic transformation of the entries by row (by gene), so the maximum of each row is reached in the same sample in both metrics


To see if the samples classified as outliers have a different general behaviour than the other samples (not just on the genes which had their maximum value in them), we calculate the z-score value each gene has on each sample and see their distribution.

The outlier samples seem to have a higher distribution of z-scores along all of the genes when comparing them to random samples, not just the one which corresponded to the max|z-score|

The results are very similar to the ones from the first metric

plot_data = data.frame('ID' = rownames(datExpr))
outlier_samples = samples_info$Sample_ID[samples_info$Count>mean(samples_info$Count)+2*sd(samples_info$Count)]

# Calculate z-score of each gene for the outlier samples
for(s in outlier_samples){
  outlier_idx = which(datMeta$Sample_ID==s)
  z_scores = apply(datExpr, 1, function(x) (abs(x[outlier_idx]-median(x)))/mad(x))
  plot_data = cbind(plot_data, z_scores)
}
colnames(plot_data)[-1] = as.character(outlier_samples)

# Select random samples for comparison
set.seed(123)
rand_samp_1 = sample(datMeta$Sample_ID[! datMeta$Sample_ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_1 = which(datMeta$Sample_ID==rand_samp_1)
set.seed(124)
rand_samp_2 = sample(datMeta$Sample_ID[! datMeta$Sample_ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_2 = which(datMeta$Sample_ID==rand_samp_2)
set.seed(125)
rand_samp_3 = sample(datMeta$Sample_ID[! datMeta$Sample_ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_3 = which(datMeta$Sample_ID==rand_samp_3)
cat(paste0('Using random samples ', rand_samp_1, ', ', rand_samp_2, ', ', rand_samp_3, ' as a reference'))
## Using random samples AN04166_BA38, AN04479_BA4_6, AN17254_BA38 as a reference
z_func = function(x, rand_samp_idx) return(abs(x[rand_samp_idx]-mean(x))/sd(x))

# Transform data for plotting
levels = c(unique(as.character(samples_info$Sample_ID[samples_info$StandardisedCount>2])),
           'Random Sample 1','Random Sample 2','Random Sample 3')
plot_data_melt = plot_data %>% mutate('Random Sample 1'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_1)),
                                      'Random Sample 2'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_2)),
                                      'Random Sample 3'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_3))) %>%
                 melt()  %>% mutate(variable = factor(variable, levels=levels, ordered=T)) %>%
                 mutate(Sample_ID = case_when(variable == 'Random Sample 1' ~ rand_samp_1,
                                              variable == 'Random Sample 2' ~ rand_samp_2,
                                              variable == 'Random Sample 3' ~ rand_samp_3,
                                              TRUE ~ as.character(variable))) %>%
                 left_join(datMeta %>% dplyr::select(Sample_ID, Diagnosis), by = 'Sample_ID')

# Plot
ggplotly(plot_data_melt %>% ggplot(aes(variable, value+1, fill=Diagnosis)) + geom_boxplot() +
         xlab('Samples') +ylab('|z-scores|') + scale_y_log10() + theme_minimal() + 
         theme(axis.text.x = element_text(angle = 90, hjust = 1)))
rm(plot_data, plot_data_melt, s, outlier_idx, z_scores, rand_samp_1, rand_samp_idx_1,
   rand_samp_2, rand_samp_idx_2, rand_samp_3, rand_samp_idx_3, levels, z_func)


Metric currently used to filter samples vs z-score


The plot is exactly the same as with the first metric, since the maximums by row happen in the same columns, so the counts by sample are the same in both methods



Direct comparison between metrics

We know that when using the metrics for sample outlier detection they give exactly the same results, but they seem to vary greatly when using them for gene outlier detection

Using a distance of 3 standard deviations to define outliers in both methods:

plot_data = data.frame('ID' = rownames(datExpr),
                       'orig_z_score' = apply(datExpr, 1, function(x) max(abs(x-mean(x))/sd(x))),
                       'robust_z_score' = apply(datExpr, 1, function(x) max(abs(x-median(x))/mad(x)))) %>%
            mutate(outliers_orig_z_score = orig_z_score > mean(orig_z_score) + 3*sd(orig_z_score),
                   outliers_robust_z_score = robust_z_score > mean(robust_z_score) + 3*sd(robust_z_score),
                   alpha = ifelse(ID == 'ENSG00000187951', 1, 0.2)) %>%
            mutate(Outliers = case_when(outliers_orig_z_score & outliers_robust_z_score ~ 'Both',
                                        outliers_orig_z_score & !outliers_robust_z_score ~ 'Only Original',
                                        !outliers_orig_z_score & outliers_robust_z_score ~ 'Only Robust',
                                        TRUE ~ 'Neither')) %>%
            mutate(Outliers = factor(Outliers, levels = c('Neither','Only Original', 'Only Robust', 'Both')))

plot_data %>% ggplot(aes(orig_z_score, robust_z_score, color=Outliers)) + geom_point(alpha=plot_data$alpha) +
              geom_vline(xintercept = mean(plot_data$orig_z_score) + 3*sd(plot_data$orig_z_score),
                         color = 'gray', linetype = 'dashed') +
              geom_hline(yintercept = mean(plot_data$robust_z_score) + 3*sd(plot_data$robust_z_score),
                         color = 'gray', linetype = 'dashed') +
              geom_smooth(method = 'gam', se = FALSE, color='gray') + 
              ggtitle(paste0('R^2 = ', round(cor(plot_data$orig_z_score, plot_data$robust_z_score)[[1]]^2,2))) +
              scale_y_log10() + scale_color_viridis_d() + xlab('Original max|z-score|') +
              ylab('Robust max|z-score|') + theme_minimal()

table_info = plot_data %>% apply_labels(outliers_orig_z_score = 'Outliers using Original Metric',
                                        outliers_robust_z_score = 'Outliers using Robust Metric')

cro(table_info$outliers_orig_z_score, list(table_info$outliers_robust_z_score, total()))
 Outliers using Robust Metric     #Total 
 FALSE   TRUE   
 Outliers using Original Metric 
   FALSE  15873 37   15910
   TRUE  183 55   238
   #Total cases  16056 92   16148
rm(table_info)


Relation to DEA Results


Repeating the plot above but colouring genes depending on whether they are DE or not:

  • The robust metric seems to be giving higher values to the DE genes: this could be a problem, for example, for genes which have a strong disregulation in ASD, which could be labelled as outliers and filtered out of the dataset … or maybe it’s the other way around, and the DE algorithm is identifying genes as statistically significant that have many outlier values due to technical artifacts instead of biological significance…
plot_data = data.frame('ID' = rownames(datExpr),
                       'orig_z_score' = apply(datExpr, 1, function(x) max(abs(x-mean(x))/sd(x))),
                       'robust_z_score' = apply(datExpr, 1, function(x) max(abs(x-median(x))/mad(x))),
                       'meanExpr' = log2(rowMeans(datExpr))) %>%
            left_join(DE_info, by = 'ID') %>%
            mutate(outliers_orig_z_score = orig_z_score > mean(orig_z_score) + 3*sd(orig_z_score),
                   outliers_robust_z_score = robust_z_score > mean(robust_z_score) + 3*sd(robust_z_score),
                   alpha = ifelse(ID == 'ENSG00000187951', 1, 0.2))

plot_data %>% ggplot(aes(orig_z_score, robust_z_score, color=DE)) + geom_point(alpha=plot_data$alpha) +
              geom_vline(xintercept = mean(plot_data$orig_z_score) + 3*sd(plot_data$orig_z_score),
                         color = 'gray', linetype = 'dashed') +
              geom_hline(yintercept = mean(plot_data$robust_z_score) + 3*sd(plot_data$robust_z_score),
                         color = 'gray', linetype = 'dashed') +
              geom_smooth(method = 'gam', se = FALSE) + 
              ggtitle(paste0('R^2 = ', round(cor(plot_data$orig_z_score, plot_data$robust_z_score)[[1]]^2,2))) +
              scale_y_log10() + xlab('Original max|z-score|') + ylab('Robust max|z-score|') + theme_minimal()

rm(table_info)


If we order the genes by their max|z-score| and calculate the percentage of DE genes using a rolling average, we can see if there’s a change in this percentage of DE genes for different levels of the metric:

This plot corroborates what we were seeing above: The genes with the highest values in the robust metric have a higher density of DE genes than the rest of the genes

This could mean:

  • The metric is capturing genes with a strong biological signal

  • The DE function is capturing genes with technical errors

  • Both?

This wouldn’t be a problem if all the outlier values in these genes didn’t correspond to ASD samples …

Note: The genes in this plot are ordered by their max|z-score| value, so there are actually two orderings of genes in the plot, one for the original metric and the other for the robust metric. That’s why there are different patterns in the % of DE genes for each metric

w = 1000
sliding_DE = data.frame('window' = 1:(nrow(datExpr)-w), 'original' = 0, 'robust' = 0)
original_sort = plot_data %>% arrange(orig_z_score)
robust_sort = plot_data %>% arrange(robust_z_score)

for(i in 1:nrow(sliding_DE)){
  sliding_DE$original[i] = mean(original_sort$DE[i:(i+w)], na.rm = TRUE)
  sliding_DE$robust[i] = mean(robust_sort$DE[i:(i+w)], na.rm = TRUE)
}

sliding_DE %>% melt(id.vars='window') %>% dplyr::rename(Metric = variable) %>% 
               ggplot(aes(window, 100*value, color=Metric)) + geom_line(alpha=0.3) + 
               geom_hline(yintercept = 100*mean(plot_data$DE, na.rm=TRUE), color = 'gray') + 
               geom_smooth(se=FALSE) + ylab('% DE genes in window') + 
               scale_color_manual(values = c('#e60073','#008080')) + theme_minimal()

rm(w, sliding_DE, original_sort, robust_sort, i)

If we plot now the rolling mean of the |LFC|:

  • For both metrics the LFC increases as the value of the metric increases (this makes sense, because the outliers belong to ASD samples)

  • The |LFC| increases more in the robust version of the metric than in the original

w = 1000
sliding_DE = data.frame('window' = 1:(nrow(datExpr)-w), 'original' = 0, 'robust' = 0)
original_sort = plot_data %>% arrange(orig_z_score)
robust_sort = plot_data %>% arrange(robust_z_score)

for(i in 1:nrow(sliding_DE)){
  sliding_DE$original[i] = mean(abs(original_sort$log2FoldChange[i:(i+w)]), na.rm = TRUE)
  sliding_DE$robust[i] = mean(abs(robust_sort$log2FoldChange[i:(i+w)]), na.rm = TRUE)
}

sliding_DE %>% melt(id.vars='window') %>% dplyr::rename(Metric = variable) %>% 
               ggplot(aes(window, 100*value, color=Metric)) + geom_line(alpha=0.3) + 
               geom_hline(yintercept = 100*mean(plot_data$log2FoldChange, na.rm=TRUE), color = 'gray') + 
               geom_smooth(se=FALSE) + ylab('Mean |LFC| of genes in window') + 
               scale_color_manual(values = c('#e60073','#008080')) + theme_minimal()

rm(w, sliding_DE, original_sort, robust_sort, i)

For the first half of the plot (~windows 0-7500): |LFC| remains roughly constant but %DE decreases steadily

  • Even though the change between Diagnoses remains the same, the increase in outlier samples (reflected in the position of the genes in a higher window) decreases the significance of the comparison, resulting in higher (and no longer significant) p-values

  • This means that the presence of outliers decreases the confidence of the DEA


For the second half of the plot (~windows 7500-1500): The %DE genes remains constant using the original metric but increases (maybe exponentially?) using the robust metric. At the same time, the |LFC| increases steadily on both metrics, but at a higher rate in the robust metric

  • For the Original metric we can give a similar conclusion as in the first half of the plot: The more extreme the outliers of a gene are, the more they damage the confidence of the DEA, resulting in genes with very high |LFC| that share the same level of confidence with genes with a not so extreme |LFC|

  • For the Robust metric, both the % of DE genes and the |LFC| increase, which sounds reasonable. The only problem with this is that we don’t know if this increase in DE genes is because of biological signals which are being mistaken for technical effects by the max|z-score|, or technical effects which are confused by biological signals by DESeq2. This problem is specific to this dataset, since consistently, the outlier values correspond to ASD samples


Relation to Level of Expression


If we plot the rolling mean of the mean level of expression of the genes:

  • The level of expression plays an important role in both of the scores (…of course)

  • Genes with lower mean expression tend to have larger max|z-score| values, for both versions of the metric

w = 1000
sliding_DE = data.frame('window' = 1:(nrow(datExpr)-w), 'original' = 0, 'robust' = 0)
original_sort = plot_data %>% arrange(orig_z_score)
robust_sort = plot_data %>% arrange(robust_z_score)

for(i in 1:nrow(sliding_DE)){
  sliding_DE$original[i] = mean(original_sort$meanExpr[i:(i+w)], na.rm = TRUE)
  sliding_DE$robust[i] = mean(robust_sort$meanExpr[i:(i+w)], na.rm = TRUE)
}

sliding_DE %>% melt(id.vars='window') %>% dplyr::rename(Metric = variable) %>% 
               ggplot(aes(window, 100*value, color=Metric)) + geom_line(alpha=0.3) + 
               geom_hline(yintercept = 100*mean(plot_data$meanExpr, na.rm=TRUE), color = 'gray') + 
               geom_smooth(se=FALSE) + ylab('Mean Mean Level of Expression in window') + 
               scale_color_manual(values = c('#e60073','#008080')) + theme_minimal()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

rm(w, sliding_DE, original_sort, robust_sort, i)


Conclusions